#! /usr/bin/env python3
#
# vector_tutorial.py -- Example of converting query results to vectors.
#
# The AtomSpace provides a powerful graph query engine that can extract
# subgraphs of arbitrary shape. Such query results are presented as
# rows, whereas many common python platforms prefer to work with
# columns. This example shows how to perform a basic query, and convert
# the results to columns.
#
# This result is written in python-Atomese. Of course, this is wrong:
# Atomese is designed to be written by algorithms, and not humans.
# It is awkward, hard-to-use and sometimes obtuse: this is on purpose,
# because Atomese is meant to be an 'assembly language' for graph
# rewriting systems, and not "yet another graph DB API for humans to
# use". However, in order to bootstrap up the chain, someone (i.e. you)
# have to understand enough Atomese to be able to debug autogenerated
# Atomese. Thus, the below is written in python, and performs abundant
# direct manipulations of Atoms, explicitly written out, to show what
# they look like. In the long run, you, the human, are NOT expected to
# write this stuff, or program this way. But, for a demo, this is
# necessary.
#
# As will become clear below, Atomese is a declarative language. One
# declares both data structures and code structures the same way: both
# code and data are represented with hypergraphs. Both code and data
# can be manipulated the same way: everything is a (hyper-)graph. Some
# hypergraphs are executable, and "do something", when executed. Others
# are static, and do nothing. In either case, the intended structure is
# directly visible and analyzable, as a (hyper-)graph.
#
# All of the non-python Atomese below is stored in the AtomSpace. It
# might look like code, and it will do things when you execute it, but
# it's also a graph. As a graph, it's storable, manipulable, embeddable.
# Every Atom below can be given a weight: it feels like code (because
# its executable) and it behaves like data - just another graph.
#
# ------------------------------------------------------------------
# Python setup.
from opencog.type_constructors import *

# ------------------------------------------------------------------
# Start by populating the AtomSpace with some data.
# For this example, the data will be very regular, for readability.
# (Thus, it could also be easily taken from some SQL database; the
# AtomSpace is designed for very irregular graphs, so the uniformity
# here hides what the AtomSpace is actually good for. No matter.
# Moving on ...)

tag = PredicateNode("word-pair")

# A standard dependency parse, showing word-pairs.
EdgeLink(tag, ListLink(ItemNode("the"), ItemNode("dog")))
EdgeLink(tag, ListLink(ItemNode("dog"), ItemNode("chased")))
EdgeLink(tag, ListLink(ItemNode("the"), ItemNode("cat")))
EdgeLink(tag, ListLink(ItemNode("chased"), ItemNode("cat")))
EdgeLink(tag, ListLink(ItemNode("chased"), ItemNode("around")))
EdgeLink(tag, ListLink(ItemNode("the"), ItemNode("house")))
EdgeLink(tag, ListLink(ItemNode("around"), ItemNode("house")))
EdgeLink(tag, ListLink(ItemNode("cat"), ItemNode("around")))
EdgeLink(tag, ListLink(ItemNode("HEAD"), ItemNode("dog")))
EdgeLink(tag, ListLink(ItemNode("HEAD"), ItemNode("chased")))

# In case you are wondering what that parse looks like "in read life",
# here it is:
#
#     +--------->WV-------->+--------MVp--------+
#     +---->Wd-----+        +-----Os-----+      +----Js----+
#     |      +Ds**c+--Ss*s--+      +Ds**c+--Mp--+    +Ds**c+
#     |      |     |        |      |     |      |    |     |
# LEFT-WALL the  dog.n chased.v-d the  cat.n around the house.n
#
# The above table just collapses each of the link types to a single
# PredicateNode("word-pair") The extra complexity would just make
# this demo more confusing. ('S' is subject, 'O' is object, 'D' is
# determiner. 'M' is a prepositional modifier, and 'J' is object of
# the preposition.

# ------------------------------------------------------------------
# Design a basic query pattern that can find the data above.
# The pattern just defines the "shape" of the data. The query defines
# the search itself, and what is to be done with the search results.
#
# A pattern that will find tagged word-pairs
pair_pattern = EdgeLink(tag,
    ListLink(VariableNode("$left-word"), VariableNode("$right-word")))

# A variable declaration for the pattern above. This is not strictly
# required, but is convenient to limit the scope of a query to variables
# of a given type. The types here are simple -- just ItemNodes. In
# general, typec can be arbitrarily complicated.
pair_vardecls = VariableList(
    TypedVariableLink(
        VariableNode("$left-word"), TypeNode("ItemNode")),
    TypedVariableLink(
        VariableNode("$right-word"), TypeNode("ItemNode")))

# Define a search for the above. It consists of the pattern, plus
# variable declarations for the (free) variables in the pattern
# (thus binding the variables.) The query is in the form of a rewrite
# rule: after matching the pattern, the variables can be used to
# define/create some new structure. For the demo below, a trivial
# rewrite is done: the initial search pattern is just echoed back.
basic_query = QueryLink(
    # The variable declarations that were constructed earlier.
    pair_vardecls,

    # The PresentLink asks that the search pattern is present in
    # the AtomSpace. One or more search terms can be combined,
    # including a spec to insist that some term be *absent*, for
    # the query to match.
    PresentLink(pair_pattern),

    # Output what was found. This just repeats the pattern.
    pair_pattern)

# Perform the actual query. This is where the CPU time gets soaked up.
# For this simple demo, just milliseconds. For large datasets, maybe
# 25K to 150K queries per second (single-threaded), depending on the
# complexity of the search pattern and the actual dataset.
basic_query.execute()

# Verify the query results by printing them out. The query results are
# cached at a location given by using the query itself as the key. The
# cached results can be accessed at any time. Rerunning the query will
# update the cache.
#
# The ValueOfLink(atom, key), when executed, will return the Value
# on `atom` located at `key`. Below, the basic_query is used as it's
# own key: it's the "well-known location" that can always be found.
print("Basic query returned:",
    ValueOfLink(basic_query, basic_query).execute())
print("")

# ------------------------------------------------------------------
# Design a more interesting query. This one will count the number of
# times that words appear on the left or right side of a pair. The
# resulting counts will be stored with the words (for later access).

# Define a key where the counts will be placed. The key can be any
# Atom at all, but by convention, PredicateNodes are used. The naming
# convention was inspired by first-order predicate logic.
count_key = PredicateNode("counter")

# Define the word-counting query. The variable declarations and search
# pattern are same as before. The rewrite side of the query has two
# parts, incrementing different counts on a very short vector.
#
# The IncrementValueLink Atom is a thread-safe, atomic increment. It can
# be run from multiple threads at once, and will not race in reading,
# incrementing and writing the count.
#
# Two counts are kept: one for the left-words, and one for the
# right-words. These two counts could be kept under separate keys.
# However, to make the demo more interesting, these two counts are
# stored under just one key, but using a FloatValue vector to store
# them. Thus, the increment is performed not a single number, but on
# the whole vector, at once. The counts on the left-words are
# incremented with [1,0], and those on the right with [0,1]. Since
# increment is atomic, the counts will be accurate.
#
# In general, the increment vector can be of any length, holding
# arbitrary floating-point values. Try it.
counting_query = QueryLink(
    pair_vardecls,
    PresentLink(pair_pattern),

    # When the above pattern is matched, the counts are incremented.
    IncrementValueLink(VariableNode("$left-word"),
       count_key, NumberNode([1,0])),
    IncrementValueLink(VariableNode("$right-word"),
       count_key, NumberNode([0,1])))

# Perform the counting.
counting_query.execute()

# ------------------------------------------------------------------
# To view the count results, we'll have a bit more fun. Construct a
# query that crawls over words, only. Just for grins, a MeetLink is
# used, instead of a QueryLink. The MeetLink does NOT perform any
# rewrites: it just returns the goundings of the variable(s), directly.
# The name "Meet" come from the fact that this query performs a meet
# on a poset. See Wikipedia:
#    https://en.wikipedia.org/wiki/Join_and_meet
# Yes, Atomese also has a JoinLink.
#
get_words = MeetLink (
    TypedVariableLink(VariableNode("$word"), TypeNode("ItemNode")),
    PresentLink(VariableNode("$word")))

print("The set of words is:", get_words.execute())
print("")

# The above will print a list of all of the words found by that query.
# Notice that these are wrapped with a UniSetValue. The UniSet is a
# thread-safe producer-consumer deduplicating set. That means that one
# or more threads can write to it, while other threads remove content
# from it.  In this case, the query is done in an eyeblink, so there is
# no point in dealing with the complexity of multiple threads.
#
# The name UniSet comes from the fact that each entry in the set is
# unique: adding the same Atom a second time does nothing. The set
# contents are de-duplicated.
#
# There is also a QueueValue. This is a thread-safe producer-consumer
# FIFO. Writers add to the tail, readers read from the head. Both the
# UniSet and the Queue block when empty, so that reader threads wait
# until something shows up for them to work on.

# ------------------------------------------------------------------
# Values are C++ vectors. Thus, FloatValue is just std::vector<double>
# and StringValue is just std::vector<std::string>. LinkValue's are
# vectors of pointers to Values or Atoms. These convert naturally to
# python lists.

# Get the cached UniSet from the earlier query:
the_unique_set_object = ValueOfLink(get_words, get_words).execute()

# Unwrap it, into a python list of Atoms
list_of_atoms = list(the_unique_set_object)
print("The list of word-atoms is", list_of_atoms)
print("")

# And a python list of strings:
list_of_word_strings = map(lambda wrd: wrd.name, list_of_atoms)
print("The list of word-strings is", list_of_word_strings)
print("")

# ------------------------------------------------------------------
# Anyway ... moving on. We want to see the counts on the words.
# A purely pythonic way of vewing these:
for wrd in list_of_atoms:
    print(f"The word \'{wrd.name}\' has count {wrd.get_value(count_key) }")

print("")

# We want to do the above, but this time, in a purely declarative,
# non-procedural style:
word_counts = QueryLink (
    TypedVariableLink(VariableNode("$word"), TypeNode("ItemNode")),
    PresentLink(VariableNode("$word")),

    # The counts were stored at count_key. So look for them there.
    ValueOfLink(VariableNode("$word"), count_key))

print("The left and right counts on the words are:", word_counts.execute())
print("")

# The above is a vector of rows. For GPU/compute processing, it is
# convenient to have two columns, instead of a bunch of rows. The
# TransposeColumn Atom, when executed, accomplishes this.
#
# The FloatValue is stored as a C++ std::vector<double> and so these
# numbers are stored in RAM, in a sequential array of 64-bit floats.
# Because it is a linear array, it can be directly sent to GPU units
# without any further copying or conversion.
#
# Here, we print this in python. In the real world, these would be
# millions of elements long, and would blow out python memory. That
# is, a 64-bit double in RAM is 8 bytes; a double in python is about
# 120 bytes. The RAM savings is significant.

two_columns = TransposeColumn(ValueOfLink(word_counts, word_counts))

print("Here are the left and right counts, again:", two_columns.execute())
print("")

# ------------------------------------------------------------------
# The above are just vectors. We've lost track of what words go with
# what. This can be a serious problem for the GPU processing of long
# vectors of weights. It would be best if each vector basis could be
# tagged with a universally unique ID (UUID).
#
# There are three ways of doing UUID's. One common way is to just issue
# an int. The problem with this is that there has to be a central
# issuing authority, otherwise different Atoms on different machines
# might get issued the same ID number. Even on the same machine, with
# multiple threads, the ID dispenser would need to be atomically locked.
# To add injury to insult, a table that associates ID numbers to Atoms
# has to be maintained. This chews up RAM, and RAM is already a precious
# commodity. So, in general, issuing UUID serial numbers is just a bad
# idea.
#
# A second common way of doing UUID's is via decentralized crypto
# techniques. The Atom can be hashed. That hash is universally
# computable: even someone on Pluto will get exactly the same hash.
# This solves the centralization problem. The problem with crypto hashes
# is the "birthday paradox" (see Wikipedia, if unclear). Hash collisions
# will occur, if the hash is not big enough. 256-bit or 512 bit hashes
# are effectively safe (although SHA-256 has been broken.) However,
# 256 bits works out to 32 bytes, which is ... a lot. Another problem
# is that a globally-distributed lookup table is needed. If you are
# given a hash, there's no way of guessing what Atom it might be, based
# on the hash: hashes are not invertable; they are one-way functions.
# Thus, global distribution, and more RAM is needed.
#
# The third solution is to just use the string name of the Atom. It is
# globally unique: there can only ever be one Atom of that name and
# type. There's no confusion about what Atom it is: one can convert
# Atoms to strings, and back again, trivially.  In most cases, strings
# are short. So, '(Item "foobar")' is 15 bytes long. One can do even
# better with compression; although a single string this long won't
# compress, collections of string names for thousands or millions of
# Atoms compress very very well; compression rates of about 7 bytes
# per Atom are seen in real-world datasets.
#
# Atoms can be converted to strings with the SexprColumn Link. This
# converts the Atom to an s-expression. We use s-expressions because
# they are more compact than JSON or YAML, and don't require significant
# whitespace the way python does. Also, s-expressions are very easy to
# parse: the matching parens are always unambiguous, and there is no
# need to track commas, semicolons or other punctuation.
#
# Just add it to the rewrite rule:

counts_and_names = QueryLink (
    TypedVariableLink(VariableNode("$word"), TypeNode("ItemNode")),
    PresentLink(VariableNode("$word")),

    # First, split out the left and right counts, each into their
    # own row. The `ElementOfLink` is just an array accessor. It
    # is just a very verbose way of subscripting an array.  That is,
    #    ElementOf(Number(42), foobar)
    # would just be
    #    foobar[42]
    # if it was written in python. Again, Atomese is not intended for
    # humans. This extra verbosity makes things very easy for
    # algorithms, results in easy parsing, and easy storage.
    # Reminder: this query is just a bunch of Atoms, and it is stored
    # in the AtomSpace, along with everything else.
    ElementOfLink(NumberNode(0), ValueOfLink(VariableNode("$word"), count_key)),
    ElementOfLink(NumberNode(1), ValueOfLink(VariableNode("$word"), count_key)),

    # Next, the names
    SexprColumn(VariableNode("$word")))

# Here we go:
print("Words and their counts:", counts_and_names.execute())
print("")

# And again, in columnar form:
three_columns = TransposeColumn(ValueOfLink(counts_and_names, counts_and_names))

print("And again, with columns:", three_columns.execute())
print("")

# ------------------------------------------------------------------
# THE END. That's All, Folks!
# ------------------------------------------------------------------
